This project follows the CRISP-DM process (Cross-Industry Standard Process for Data Mining).
CRISP-DM
The coronavirus (COVID-19) pandemic is the defining global health crisis of our time, affecting 213 countries and territories around the world as well as 2 international conveyances.
Based on the Novel Coronavirus (COVID-19) Cases dataset provided by JHU CSSE, I want to gain some insights into:
from pathlib import Path
import os
from IPython.display import Image

# Directory holding the images embedded in this notebook. It is built from
# path components instead of the raw string "..\images", whose backslash is
# only a path separator on Windows.
data_dir = Path("..") / "images"
# List the files available in the image directory.
os.listdir(data_dir)
# Show the first image. The path is converted to str in case the installed
# IPython version expects a plain string filename.
Image(filename=str(data_dir / 'image-0.PNG'))
First Exploratory Data Analysis - EDA
# Import common packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import plotly.graph_objects as go
import plotly.express as px
# Load data
# NOTE(review): PATH is an absolute, machine-specific Windows directory; point
# it at the local copy of the dataset before running. It stays a plain string
# because later cells build file names with `PATH + '<file>.csv'`.
PATH = "C:\\Users\\DucDQ1\\Desktop\\Data-Science-Blog-Post-master\\Data-Science-Blog-Post-master\\datasources\\novel-corona-virus-2019-dataset\\"
# `parse_dates` is the read_csv option that takes a list of column names; the
# list was previously passed to `date_parser`, which expects a callable and is
# deprecated in recent pandas releases.
df_covid19 = pd.read_csv(PATH + 'covid_19_data.csv', parse_dates=['Last Update'])
# Per-location time series, one file per metric.
df_confirmed = pd.read_csv(PATH + 'time_series_covid_19_confirmed.csv')
df_recovered = pd.read_csv(PATH + 'time_series_covid_19_recovered.csv')
df_deaths = pd.read_csv(PATH + 'time_series_covid_19_deaths.csv')
# Which columns contain at least one missing value?
df_covid19.isna().any()
# Rows that have a missing value in any column
df_covid19.loc[df_covid19.isna().any(axis=1)]
# Preview of the confirmed-cases time series
df_confirmed.head(5)
# First rows of the dataset (earliest cases in the current dataset)
df_covid19.head(5)
The first date with recorded cases in the dataset is Jan 22, 2020.
# Last rows of the dataset (latest cases in the current dataset)
df_covid19.tail(5)
The latest date in the dataset is May 13, 2020
We can now look at the third step of the process:
# Data Cleaning - Columns Renamed
df_covid19.rename(columns={'ObservationDate': 'Date', 'Country/Region': 'Country'}, inplace=True)
# Assign the result back rather than calling replace(inplace=True) on the
# selected column: that chained in-place form acts on a temporary Series
# under pandas Copy-on-Write and would leave df_covid19 unchanged.
df_covid19['Country'] = df_covid19['Country'].replace({'Mainland China': 'China'})
# Apply the same 'Country/Region' -> 'Country' rename to the three
# time-series tables.
for time_series in (df_confirmed, df_recovered, df_deaths):
    time_series.rename(columns={'Country/Region': 'Country'}, inplace=True)
# The evolution of COVID-19 cases - Confirmed, Deaths, Recovered - Worldwide
# Select the three count columns before summing so the aggregation runs once
# and never touches the text columns of df_covid19 (e.g. 'Country').
daily_totals = df_covid19.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()
confirmed = daily_totals['Confirmed'].reset_index()
deaths = daily_totals['Deaths'].reset_index()
recovered = daily_totals['Recovered'].reset_index()

fig = go.Figure()
# One bar series per metric: (data frame, column / legend name, bar colour).
for frame, column, colour in (
    (confirmed, 'Confirmed', 'blue'),
    (deaths, 'Deaths', 'Red'),
    (recovered, 'Recovered', 'Green'),
):
    fig.add_trace(go.Bar(x=frame['Date'],
                         y=frame[column],
                         name=column,
                         marker_color=colour
                         ))
fig.update_layout(
    title='The Evolution of COVID-19 Cases - Worldwide',
    xaxis_tickfont_size=14,
    yaxis=dict(
        # The axis title font is set through title.font; the legacy
        # `titlefont` shortcut is deprecated in Plotly.
        title=dict(text='Number of Cases', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1  # gap between bars of the same location coordinate.
)
fig.show()
# Keep only the location columns of the confirmed-cases table.
location_columns = ["Province/State", "Lat", "Long", "Country"]
df_confirmed = df_confirmed[location_columns]
df_temp = df_covid19.copy()
# Attach coordinates to each observation by joining on country and province
# (default inner join).
df_latlong = df_temp.merge(df_confirmed, on=["Country", "Province/State"])

# Animated density map, one frame per value of the 'Date' column.
map_options = dict(
    lat="Lat",
    lon="Long",
    hover_name="Province/State",
    hover_data=["Confirmed", "Deaths", "Recovered"],
    animation_frame="Date",
    color_continuous_scale="Portland",
    radius=7,
    zoom=0,
    height=700,
)
fig = px.density_mapbox(df_latlong, **map_options)

layout_font = dict(family="Courier New, monospace", size=18, color="#7f7f7f")
fig.update_layout(title='COVID-19 Cases Time Lapse - Worldwide', font=layout_font)
fig.update_layout(mapbox_style="open-street-map", mapbox_center_lon=0)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()
# Top 10 countries by confirmed cases.
# NOTE(review): the counts in df_covid19 appear to be running totals per
# observation date (confirm against the dataset description). Summing them
# over every date would count each case once per day it has been reported, so
# only the most recent observation date is aggregated here.
observation_dates = pd.to_datetime(df_covid19['Date'])
latest_snapshot = df_covid19[observation_dates == observation_dates.max()]
country = latest_snapshot.groupby(['Country'])[['Confirmed', 'Recovered', 'Deaths']].sum()
top_10 = country.nlargest(10, ['Confirmed'])

plt.figure(figsize=(20, 16))
# One horizontal bar chart per metric, stacked vertically: (column, colour).
for position, (metric, colour) in enumerate(
    [('Confirmed', 'blue'), ('Deaths', 'red'), ('Recovered', 'green')], start=1
):
    plt.subplot(3, 1, position)
    if position == 1:
        # The overall title sits above the first panel only.
        plt.title('Top 10 Countries with confirmed, recovered and death cases', fontsize=20)
    plt.barh(top_10.index, top_10[metric], color=colour)
    plt.yticks(fontsize=20)
    plt.xlabel(metric, fontsize=20)
plt.show()
# Load the per-patient line list and keep only the columns used below.
line_list_columns = ['id', 'location', 'country', 'gender', 'age', 'death', 'recovered']
data = pd.read_csv(PATH + 'COVID19_line_list_data.csv')[line_list_columns]
data.head(n=10)
The age distribution of cases
plt.figure(figsize=(15, 6))
# histplot replaces the deprecated sns.distplot. kde=True with stat='density'
# keeps distplot's default output: a normalised histogram with a density
# curve drawn over it. Missing ages are dropped explicitly.
sns.histplot(data['age'].dropna(), bins=50, color='g', kde=True, stat='density')
plt.title('Age Distribution')
plt.xlabel("Age")
plt.show()
# By Country - Vietnam
# Daily totals for Vietnam, one row per observation date.
df_vietnam = (
    df_covid19.query('Country=="Vietnam"')
    .groupby("Date")[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)
# Parse the dates and order the rows chronologically so the x-axis shows real
# dates (the axis is labelled 'Date') instead of the integer row index.
vietnam_dates = pd.to_datetime(df_vietnam['Date']).sort_values()
df_vietnam = df_vietnam.loc[vietnam_dates.index].reset_index(drop=True)
vietnam_dates = vietnam_dates.reset_index(drop=True)
df_vietnam

# Select the style before creating the figure: style settings only affect
# artists created afterwards, so applying it after plotting does nothing for
# this chart.
plt.style.use('ggplot')
plt.figure(figsize=(20, 10))
plt.title('COVID-19 Cases Overtime in Vietnam', fontsize=30)
plt.xlabel('Date', fontsize=20)
plt.ylabel('Number of cases', fontsize=20)
plt.plot(vietnam_dates, df_vietnam['Confirmed'], label='Infected', linewidth=3, color='blue')
plt.plot(vietnam_dates, df_vietnam['Recovered'], label='Recovered', linewidth=3, color='green')
plt.plot(vietnam_dates, df_vietnam['Deaths'], label='Deaths', linewidth=3, color='red')
# Light bars behind the lines emphasise the confirmed-case counts.
plt.bar(vietnam_dates, df_vietnam['Confirmed'], alpha=0.2, color='c')
plt.xticks(fontsize=15, rotation=90)
plt.yticks(fontsize=15)
plt.legend()
Looking at the questions, there is no need for any predictive modeling. Descriptive statistics and a little inferential statistics are enough to obtain the results. Therefore, the Data Modeling step of CRISP-DM is not necessary to answer the questions.
The results are shown on Medium.